/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.app.mediafilter;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;

import org.apache.log4j.Logger;
import org.dspace.core.ConfigurationManager;
import org.dspace.core.Utils;

/**
 * Text MediaFilter for PDF sources
 *
 * This filter produces extracted text suitable for building an index,
 * but not for display to end users.
 * It forks a process running the "pdftotext" program from the
 * XPdf suite -- see http://www.foolabs.com/xpdf/
 * This is a suite of open-source PDF tools that has been widely ported
 * to Unix platforms and the ones we use (pdftoppm, pdftotext) even
 * run on Win32.
 *
 * This was written for the FACADE project but it is not directly connected
 * to any of the other FACADE-specific software.  The FACADE UI expects
 * to find thumbnail images for 3D PDFs generated by this filter.
 *
 * Requires DSpace config properties keys:
 *
 *  xpdf.path.pdftotext -- path to "pdftotext" executable (required!)
 *
 * @author Larry Stone
 * @see org.dspace.app.mediafilter.MediaFilter
 */
public class XPDF2Text extends MediaFilter
{
    private static final Logger log = Logger.getLogger(XPDF2Text.class);

    // Command to get text from pdf; @infile@, @COMMAND@ are placeholders
    private static final String[] XPDF_PDFTOTEXT_COMMAND =
    {
        "@COMMAND@", "-q", "-enc", "UTF-8", "@infile@", "-"
    };

    // Positions of the placeholders within XPDF_PDFTOTEXT_COMMAND.
    private static final int COMMAND_INDEX = 0;
    private static final int INFILE_INDEX = 4;

    // executable path that comes from DSpace config at runtime.
    private String pdftotextPath = null;

    /**
     * Derive the name of the filtered bitstream from the source name.
     *
     * @param oldFilename name of the source bitstream
     * @return <code>oldFilename</code> with ".txt" appended
     */
    public String getFilteredName(String oldFilename)
    {
        return oldFilename + ".txt";
    }

    /**
     * @return the name of the bundle this filter's output belongs in ("TEXT")
     */
    public String getBundleName()
    {
        return "TEXT";
    }

    /**
     * @return the format name of this filter's output ("Text")
     */
    public String getFormatString()
    {
        return "Text";
    }

    /**
     * @return a description of this filter's output ("Extracted Text")
     */
    public String getDescription()
    {
        return "Extracted Text";
    }

    /**
     * Extract the text of a PDF by running the configured "pdftotext"
     * executable on a temporary copy of the source and capturing its
     * standard output.
     *
     * The source stream is always closed by this method, the temporary
     * file is always removed, and the subprocess is always cleaned up,
     * whether or not the conversion succeeds.
     *
     * @param sourceStream stream of the source PDF; consumed and closed here
     * @return in-memory stream over the bytes pdftotext wrote to stdout
     * @throws IllegalStateException if "xpdf.path.pdftotext" is not configured
     * @throws IOException if copying fails or pdftotext exits with a
     *         non-zero status
     * @throws InterruptedException if interrupted while waiting for the
     *         subprocess; the thread's interrupt flag is restored first
     * @throws Exception declared for compatibility with existing callers
     */
    public InputStream getDestinationStream(InputStream sourceStream)
            throws Exception
    {
        // get configured value for path to XPDF command:
        if (pdftotextPath == null)
        {
            pdftotextPath = ConfigurationManager.getProperty("xpdf.path.pdftotext");
            if (pdftotextPath == null)
            {
                throw new IllegalStateException("No value for key \"xpdf.path.pdftotext\" in DSpace configuration!  Should be path to XPDF pdftotext executable.");
            }
        }

        File sourceTmp = File.createTempFile("DSfilt",".pdf");
        sourceTmp.deleteOnExit();  // extra insurance, we'll delete it here.
        int status = -1;
        Process pdfProc = null;
        try
        {
            // make local temp copy of source PDF since PDF tools
            // require a file for random access.
            // XXX fixme could optimize if we ever get an interface to grab asset *files*
            OutputStream sto = new FileOutputStream(sourceTmp);
            try
            {
                Utils.copy(sourceStream, sto);
                // Close explicitly on the success path so that a failed
                // flush/close is reported instead of silently ignored.
                sto.close();
                sourceStream.close();
            }
            finally
            {
                // Only matters when the copy failed part-way; closing an
                // already closed stream has no effect.
                closeQuietly(sto);
                closeQuietly(sourceStream);
            }

            String[] pdfCmd = XPDF_PDFTOTEXT_COMMAND.clone();
            pdfCmd[COMMAND_INDEX] = pdftotextPath;
            pdfCmd[INFILE_INDEX] = sourceTmp.toString();

            log.debug("Running command: "+Arrays.deepToString(pdfCmd));
            pdfProc = Runtime.getRuntime().exec(pdfCmd);

            // Drain stdout completely *before* waiting, otherwise the child
            // can block on a full pipe.  stderr is not drained; the "-q"
            // flag is expected to keep pdftotext from writing to it.
            InputStream stdout = pdfProc.getInputStream();
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            Utils.copy(new BufferedInputStream(stdout), baos);
            stdout.close();

            status = pdfProc.waitFor();

            String msg = describeFailure(status, sourceTmp);
            if (msg != null)
            {
                log.error(msg);
                throw new IOException(msg);
            }

            return new ByteArrayInputStream(baos.toByteArray());
        }
        catch (InterruptedException e)
        {
            log.error("Failed in pdftotext subprocess: ",e);
            // The signature is "throws Exception", so callers may well catch
            // this generically; keep the interrupt visible to them.
            Thread.currentThread().interrupt();
            throw e;
        }
        finally
        {
            // Release the child and its pipes first, so that nothing still
            // holds the temporary file open when we try to delete it.
            if (pdfProc != null)
            {
                closeQuietly(pdfProc.getInputStream());
                closeQuietly(pdfProc.getErrorStream());
                closeQuietly(pdfProc.getOutputStream());
                pdfProc.destroy();
            }
            if (!sourceTmp.delete())
            {
                log.error("Unable to delete temporary file");
            }
            if (status != 0)
            {
                log.error("PDF conversion proc failed, returns=" + status + ", file=" + sourceTmp);
            }
        }
    }

    /**
     * Translate a pdftotext exit status into an error message.
     *
     * @param status exit status returned by the subprocess
     * @param sourceTmp temporary file that was handed to pdftotext
     * @return an error message, or null when the status is 0 (success)
     */
    private static String describeFailure(int status, File sourceTmp)
    {
        if (status == 0)
        {
            return null;
        }
        if (status == 1)
        {
            return "pdftotext failed opening input: file=" + sourceTmp.toString();
        }
        if (status == 3)
        {
            return "pdftotext permission failure (perhaps copying of text from this document is not allowed - check PDF file's internal permissions): file=" + sourceTmp.toString();
        }
        return "pdftotext failed, maybe corrupt PDF? status=" + String.valueOf(status);
    }

    /**
     * Close a stream during cleanup without letting a close failure mask
     * whatever exception is already propagating.
     *
     * @param stream stream to close; may be null
     */
    private static void closeQuietly(java.io.Closeable stream)
    {
        if (stream == null)
        {
            return;
        }
        try
        {
            stream.close();
        }
        catch (IOException e)
        {
            log.warn("Failed to close stream during cleanup", e);
        }
    }
}

 	  	 
